The transcription factors (TF) and DNA recognitions depend on multiples levels of interactions. The first level involves chromatin accessibility, where nuclesosome-depleted regions are highly associated with TFs binding compared to the closed chromatin, which is often inaccesible to most TFs. The second represents the existence of the consensus binding motif in the DNA sequence, a point that we are going to identify in this notebook in each cell type using Signac and ChromVar.
library(Seurat)
library(Signac)
library(reshape)
library(ggplot2)
library(JASPAR2020)
library(TFBSTools)
library(BSgenome.Hsapiens.UCSC.hg38)
library(patchwork)
library(chromVAR)
library(motifmatchr)
library(ggpubr)
library(data.table)
library(chromVARmotifs)
library(dplyr)
library(purrr)
library(readxl)
library(writexl)
library(pheatmap)
library(factoextra)
library(corrplot)
set.seed(1234)
cell_type = "CD4_T"
# Paths
path_to_obj <- paste0(
here::here("scATAC-seq/results/R_objects/level_5/"),
cell_type,
"/04.",
cell_type,
"_integration_peak_calling_level_5.rds",
sep = ""
)
color_palette <- c("#1CFFCE", "#90AD1C", "#C075A6", "#85660D",
"#5A5156", "#AA0DFE", "#F8A19F", "#F7E1A0",
"#1C8356", "#FEAF16", "#822E1C", "#C4451C",
"#1CBE4F", "#325A9B", "#F6222E", "#FE00FA",
"#FBE426", "#16FF32", "black", "#3283FE",
"#B00068", "#DEA0FD", "#B10DA1", "#E4E1E3",
"#90AD1C", "#FE00FA", "#85660D", "#3B00FB",
"#822E1C", "coral2", "#1CFFCE", "#1CBE4F",
"#3283FE", "#FBE426", "#F7E1A0", "#325A9B",
"#2ED9FF", "#B5EFB5", "#5A5156", "#DEA0FD",
"#FEAF16", "#683B79", "#B10DA1", "#1C7F93",
"#F8A19F", "dark orange", "#FEAF16",
"#FBE426", "Brown")
path_to_save <- paste0(
here::here("scATAC-seq/results/R_objects/level_5/"),
cell_type,
"/05.",
cell_type,
"_chromVar_JASPAR_level_5.rds",
sep = ""
)
path_to_save_TF_motifs <- paste0(
here::here("scATAC-seq/results/files/"),
cell_type,
"/",
cell_type,
"_chromVar_JASPAR_level_5.xlsx",
sep = ""
)
remove_correlated_helper <- function(mat, val, cutoff = 0.9) {
stopifnot(nrow(mat) == length(val))
cormat <- cor(t(mat), use = "pairwise.complete.obs")
diag(cormat) <- NA
keep <- seq_len(nrow(mat))
for (i in order(val, decreasing = TRUE)) {
if (i %in% keep) {
toremove <- which(cormat[keep, i] >= cutoff)
if (length(toremove) > 0)
keep <- keep[-toremove]
}
}
return(keep)
}
seurat <- readRDS(path_to_obj)
seurat
## An object of class Seurat
## 93116 features across 16383 samples within 1 assay
## Active assay: peaks_level_5 (93116 features, 92407 variable features)
## 3 dimensional reductions calculated: umap, lsi, harmony
DimPlot(
seurat,
group.by = "annotation_paper",
cols = color_palette,
pt.size = 0.1
)
DimPlot(
seurat,
split.by = "annotation_paper",
cols = color_palette,
pt.size = 0.1,
ncol = 5
) + NoLegend()
JASPAR is a collection of transcription factor DNA-binding preferences, modeled as matrices. To have a detaill explanation of it, visit the following link, http://jaspar.genereg.net/about/.
opts <- list()
opts[["tax_group"]] <- "vertebrates"
pfm <- getMatrixSet(JASPAR2020, opts)
length(pfm)
## [1] 746
seurat <- AddMotifs(
object = seurat,
genome = BSgenome.Hsapiens.UCSC.hg38,
pfm = pfm
)
seurat[["peaks_level_5"]]
## ChromatinAssay data with 93116 features for 16383 cells
## Variable features: 92407
## Genome:
## Annotation present: TRUE
## Motifs present: TRUE
## Fragment files: 9
The TF motif enrichments (that help us to predict potential specific cell-type regulators) previously computed are not calculated per-cell and they do not take into account the insertion sequence bias of the Tn5 transposase. To account for these issues we can use chromVAR that computes for each motif annotation and each cell a bias corrected “desviation” in accessibility from a expected accessibility based on the average of all the cells. This allows us to visualize motif activities per cell, and also provides an alternative method of identifying differentially-active motifs between cell types.
# The RunChromVAR function retrieved the deviationScores, the Z-scores for each bias corrected deviations.
seurat <- RunChromVAR(
object = seurat,
genome = BSgenome.Hsapiens.UCSC.hg38
)
saveRDS(seurat, path_to_save)
avgexpr_mat <- AverageExpression(
seurat,
assays = "chromvar",
return.seurat = F,
group.by = "ident",
slot = "data")
res.pca <- prcomp(t(avgexpr_mat$chromvar),scale. = T)
options(repr.plot.width=6, repr.plot.height=8)
fviz_pca_ind(res.pca,
repel = TRUE)
pheatmap(avgexpr_mat$chromvar, scale = "row",
border_color = "black",
cluster_rows = T,
cluster_cols = T,
fontsize_row= 3,
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2",
cutree_rows = NA,
cutree_cols = 2)
We compute the standar desviation for each annotated motif. Then, we select motifs that have a standar desviation higher than a specific theshold and use them to perform a correlation test and a principal component analysis. Note, that we are going to use the function “remove_correlated_helper”, to eliminate the variables that present a correlation greater than 0.9.
seurat_average <- AverageExpression(
seurat,
assays = "chromvar",
return.seurat = T,
group.by = "ident")
threshold = 1
matrix <- seurat_average[["chromvar"]]@data
vars <- matrixStats::rowSds(matrix, na_rm = TRUE)
boxplot(vars)
abline(h=1)
ix <- which(vars >= threshold)
ix2 <- ix[remove_correlated_helper(matrix[ix, , drop = FALSE],
vars[ix])]
cormat <- cor(matrix[ix2,],
use = "pairwise.complete.obs")
corrplot(cormat, type="upper", order="hclust", tl.col="black", tl.srt=45)
pc_res <- prcomp(t(matrix[ix2, ]))
fviz_pca_ind(pc_res, repel = TRUE)
DefaultAssay(seurat) <- 'chromvar'
Idents(seurat) <- seurat$annotation_paper
da_regions <- FindAllMarkers(
object = seurat,
only.pos = TRUE,
min.pct = 0.1,
test.use = 'LR',
latent.vars = 'nCount_peaks_level_5'
)
motif_name <- sapply(da_regions$gene, function(x) {name(getMatrixByID(JASPAR2020, ID = x))})
da_regions$motif_name <- motif_name
families <- sapply(da_regions$gene, function(x) {tags(getMatrixByID(JASPAR2020, ID = x))$family})
da_regions$family <- families
da_regions$family <- gsub('\\s+', '',da_regions$family)
DT::datatable(da_regions)
da_regions_selected <- (da_regions[da_regions$p_val_adj < 0.005 &
da_regions$avg_log2FC > 0.5, ])
We transform the motif ID in motif name and also we add the family associated with it.
output <- split(da_regions, da_regions$cluster)
names(output) <- c("Naive", "CM Pre-non-Tfh", "CM PreTfh", "T-Trans-Mem",
"T-Eff-Mem", "T-helper", "Tfh T-B border", "Tfh-LZ-GC",
"GC-Tfh-SAP", "GC-Tfh-0X40", "Tfh-Mem","Eff-Tregs","non-GC-Tf-regs","GC-Tf-regs")
write_xlsx(output, path_to_save_TF_motifs)
correlation <- data.frame(table(da_regions_selected$cluster),
table(seurat$annotation_paper))[,c(1,2,4)]
colnames(correlation) <- c("cluster","n_motifs","n_cells")
ggscatter(correlation, x = "n_cells",
y = "n_motifs",
add = "reg.line", # Add regression line
conf.int = TRUE,
label = "cluster",
font.label = c(9, "bold", "black"),
add.params = list(color = "blue",
fill = "lightgray")
)+stat_cor(method = "pearson",
label.x = 3,
label.y = 500) # Add correlation coefficient
da_regions_selected_sorted <- da_regions_selected[with(da_regions_selected,
order(cluster, -avg_log2FC)), ]
da_regions_selected_sorted$rank <- ave(da_regions_selected_sorted$avg_log2FC,
da_regions_selected_sorted$cluster,
FUN = seq_along)
da_regions_selected_sorted_prepared <- da_regions_selected_sorted %>%
group_by(cluster) %>% top_n(20,-rank)
ggplot(da_regions_selected_sorted_prepared,
aes(x = rank,
y = avg_log2FC,
color = avg_log2FC)) +
geom_point(size = 1) +
ggrepel::geom_label_repel(
data = da_regions_selected_sorted_prepared,
aes(y = avg_log2FC, label = motif_name),
size = 4,
nudge_x = 2,
color = "black"
) + theme_minimal() + facet_wrap(~ cluster, ncol = 3)
We can plot the motifs group by family.
ggbarplot(da_regions_selected_sorted_prepared, "family", "avg_log2FC",
fill = "cluster",width = 0.5,
size = 0.1) + theme(legend.position = "none",
text = element_text(size=12),
axis.text.x = element_text(angle=90, hjust=1),
strip.text.x = element_text(size = 7, colour = "black")) +
facet_grid(~cluster, scales = "free_x")
name <- "GC-Tfh-SAP"
specific_cluster = da_regions_selected_sorted_prepared[da_regions_selected_sorted_prepared$cluster == name,]
ggbarplot(specific_cluster, "motif_name", "avg_log2FC",
fill = "family",width = 0.5,
size = 0.1) + theme(legend.position = "none",
text = element_text(size=10),
axis.text.x = element_text(angle=90, hjust=1),
strip.text.x = element_text(size = 7, colour = "black")) +
facet_grid(~cluster ~family, scales = "free_x")
specific_cluster_sorted <- specific_cluster[order(specific_cluster$rank),]
FeaturePlot(
object = seurat,
features = specific_cluster_sorted$gene[1:4],
min.cutoff = 'q5',
max.cutoff = 'q95',
pt.size = 0.1,
ncol = 2
)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS/LAPACK: /Users/pauli/opt/anaconda3/envs/Motif_TF/lib/libopenblasp-r0.3.10.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] corrplot_0.84 factoextra_1.0.7.999 pheatmap_1.0.12 writexl_1.4.0 readxl_1.3.1 purrr_0.3.4 dplyr_1.0.7 chromVARmotifs_0.2.0 data.table_1.14.0 ggpubr_0.4.0 motifmatchr_1.10.0 chromVAR_1.10.0 patchwork_1.1.1 BSgenome.Hsapiens.UCSC.hg38_1.4.3 BSgenome_1.56.0 rtracklayer_1.48.0 Biostrings_2.56.0 XVector_0.28.0 GenomicRanges_1.40.0 GenomeInfoDb_1.24.2 IRanges_2.22.1 S4Vectors_0.26.0 BiocGenerics_0.34.0 TFBSTools_1.26.0 JASPAR2020_0.99.10 ggplot2_3.3.5 reshape_0.8.8 Signac_1.2.1 SeuratObject_4.0.2 Seurat_4.0.3 BiocStyle_2.16.1
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.1 reticulate_1.20 R.utils_2.10.1 tidyselect_1.1.1 poweRlaw_0.70.6 RSQLite_2.2.1 AnnotationDbi_1.50.3 htmlwidgets_1.5.3 grid_4.0.3 docopt_0.7.1 BiocParallel_1.22.0 Rtsne_0.15 munsell_0.5.0 codetools_0.2-17 ica_1.0-2 DT_0.16 future_1.21.0 miniUI_0.1.1.1 withr_2.4.2 colorspace_2.0-2 Biobase_2.48.0 knitr_1.30 ROCR_1.0-11 ggsignif_0.6.0 tensor_1.5 listenv_0.8.0 labeling_0.4.2 slam_0.1-47 GenomeInfoDbData_1.2.3 polyclip_1.10-0 bit64_4.0.5 farver_2.1.0 rprojroot_2.0.2 parallelly_1.26.1 vctrs_0.3.8 generics_0.1.0 xfun_0.18 lsa_0.73.2 ggseqlogo_0.1 R6_2.5.0 bitops_1.0-7 spatstat.utils_2.2-0
## [43] DelayedArray_0.14.0 assertthat_0.2.1 promises_1.2.0.1 scales_1.1.1 gtable_0.3.0 globals_0.14.0 goftest_1.2-2 seqLogo_1.54.3 rlang_0.4.11 RcppRoll_0.3.0 splines_4.0.3 rstatix_0.6.0 lazyeval_0.2.2 broom_0.7.2 spatstat.geom_2.2-0 BiocManager_1.30.10 yaml_2.2.1 reshape2_1.4.4 abind_1.4-5 crosstalk_1.1.1 backports_1.1.10 httpuv_1.6.1 tools_4.0.3 bookdown_0.21 nabor_0.5.0 ellipsis_0.3.2 spatstat.core_2.2-0 RColorBrewer_1.1-2 ggridges_0.5.3 Rcpp_1.0.6 plyr_1.8.6 zlibbioc_1.34.0 RCurl_1.98-1.2 rpart_4.1-15 deldir_0.2-10 pbapply_1.4-3 cowplot_1.1.1 zoo_1.8-9 haven_2.3.1 SummarizedExperiment_1.18.1 ggrepel_0.9.1 cluster_2.1.0
## [85] here_1.0.1 magrittr_2.0.1 scattermore_0.7 openxlsx_4.2.4 lmtest_0.9-38 RANN_2.6.1 SnowballC_0.7.0 fitdistrplus_1.1-5 matrixStats_0.59.0 hms_0.5.3 mime_0.11 evaluate_0.14 xtable_1.8-4 XML_3.99-0.3 rio_0.5.16 sparsesvd_0.2 gridExtra_2.3 compiler_4.0.3 tibble_3.1.2 KernSmooth_2.23-17 crayon_1.4.1 R.oo_1.24.0 htmltools_0.5.1.1 mgcv_1.8-33 later_1.2.0 tidyr_1.1.3 DBI_1.1.0 tweenr_1.0.1 MASS_7.3-53 car_3.0-10 Matrix_1.3-4 readr_1.4.0 R.methodsS3_1.8.1 igraph_1.2.6 forcats_0.5.0 pkgconfig_2.0.3 GenomicAlignments_1.24.0 TFMPvalue_0.0.8 foreign_0.8-80 plotly_4.9.4.1 spatstat.sparse_2.0-0 annotate_1.66.0
## [127] DirichletMultinomial_1.30.0 stringr_1.4.0 digest_0.6.27 sctransform_0.3.2 RcppAnnoy_0.0.18 pracma_2.2.9 CNEr_1.24.0 spatstat.data_2.1-0 cellranger_1.1.0 rmarkdown_2.5 leiden_0.3.8 fastmatch_1.1-0 uwot_0.1.10 curl_4.3.2 shiny_1.6.0 Rsamtools_2.4.0 gtools_3.9.2 lifecycle_1.0.0 nlme_3.1-150 jsonlite_1.7.2 carData_3.0-4 viridisLite_0.4.0 fansi_0.5.0 pillar_1.6.1 lattice_0.20-41 KEGGREST_1.28.0 fastmap_1.1.0 httr_1.4.2 survival_3.2-7 GO.db_3.11.4 glue_1.4.2 zip_2.1.1 qlcMatrix_0.9.7 png_0.1-7 bit_4.0.4 ggforce_0.3.2 stringi_1.6.2 blob_1.2.1 caTools_1.18.2 memoise_1.1.0 irlba_2.3.3 future.apply_1.7.0